* Session setup for the NCT registry linkage script.
* Start from an empty session before changing any limits.
clear all
* Do not pause output at --more-- prompts.
set more off
* Raise the matrix-size and variable-count limits.
set matsize 10000
set maxvar 10000
* Append the project's gslab_tools directory to the ado search path.
adopath + ../code/gslab_tools/

* NOTE(review): preliminaries is not a built-in command; presumably it is
* provided by gslab_tools (added to the adopath above) -- confirm.
* doutf() is given the derived-data directory that this script writes to.
preliminaries, doutf(../derived/NCT_registry/)
* Use Times New Roman as the font in graph windows.
graph set window fontface "Times New Roman"


/*
 * Flag papers that can be linked to the NCT registry.
 *
 * Reads the wide arm-level study file, extracts an NCT identifier from the
 * study name (falling back to the study id), tabulates linkage against the
 * analysis-sample flag and sponsorship, and saves registry_link.dta.
 */

use studyname studyid study_no anti_sample drug* no_randomised* sponsor* ///
	year* using "../derived/Combined/combined_arm_level_wide.dta", clear
drop drug_type*

* An identifier is the 11 characters starting at the first "NCT".
* studyname is searched first; studyid is used only when studyname has none.
generate nct_id = ""
foreach id_source of varlist studyname studyid {
	replace nct_id = substr(`id_source', strpos(`id_source', "NCT"), 11) ///
		if nct_id == "" & strpos(`id_source', "NCT") > 0
}

* Manually add missing NCTs (each trial was searched for in the NCT registry)
replace nct_id = "NCT01148472" if studyname == "Wade 2007 (Study 10990)"

* linked = 1 when an NCT identifier was found for the study
generate linked = (nct_id != "")
tabulate linked anti_sample

* published = 1 when the study has a non-missing year
generate published = (year != .)
egen max_sponsor = rowmax(sponsor1 sponsor2 sponsor3 sponsor4 sponsor5 sponsor6 sponsor7)
tabulate linked max_sponsor

tabulate max_sponsor published if linked & anti_sample == 1, row
save "../derived/NCT_registry/registry_link.dta", replace

/*
 * Get registry denominator.
 *
 * Imports the pipe-delimited registry study table, keeps the identifier,
 * date, enrollment, source, title and phase columns, and saves studies.dta.
 */

import delimited using "../raw/NCT_Registry/data/studies.txt", ///
	delimiters("|") clear
keep nct_id brief_title source phase ///
	study_first_submitted_date ///
	start_date start_date_type ///
	completion_date completion_date_type ///
	enrollment enrollment_type
* Display format only: show up to 50 characters of the long text columns
format source brief_title %50s
save "../derived/NCT_registry/studies.dta", replace

***
* Get drug list to merge in
* Builds the list of distinct non-placebo drug names from the arm-level file.
* The names are also stored in the local macro drug_names, which the
* interventions step later in this script loops over.
***
use drug using "../derived/Combined/combined_arm_level.dta", clear
drop if drug == "placebo"
duplicates drop
rename drug drug_name
levelsof drug_name, local(drug_names)
save "../derived/NCT_registry/drug_list.dta", replace

***
* Get Author proxy
* Keeps the responsible party's name and affiliation for each registry entry.
***
import delimited using "../raw/NCT_Registry/data/responsible_parties.txt", ///
	delimiters("|") clear
rename (name affiliation) (responsible_name responsible_affiliation)
* Display format only: show up to 50 characters
format responsible_name responsible_affiliation %50s
keep nct_id responsible*
save "../derived/NCT_registry/responsible_party.dta", replace

***
* List of drugs / interventions
* Keeps registry trials with at least one "Drug" intervention whose name
* contains a study drug name, drops trials that list a combination product,
* and saves one row per trial with up to seven drug_name columns.
***
import delimited using "../raw/NCT_Registry/data/interventions.txt", ///
	delimiters("|") clear
keep if intervention_type == "Drug"
keep nct_id intervention_type name
rename name drug_name
* Normalise names before substring matching: lower case, outer blanks removed
replace drug_name = strtrim(lower(drug_name))
format drug_name %50s

* Flag interventions whose name contains any study drug name, or "dvs".
* drug_names is the local macro filled by levelsof in the drug-list step.
generate is_study_drug = 0
foreach drug in `drug_names' "dvs" {
	replace is_study_drug = 1 if strpos(drug_name, "`drug'") > 0
}

* Exclude combination drugs: names containing "+" or the word "combination"
generate is_combination = strpos(drug_name, "+") > 0 | strpos(drug_name, "combination") > 0

* Both filters act on whole trials: a trial stays if any of its interventions
* is a study drug, and is then removed if any of them is a combination.
bysort nct_id: egen trial_has_study_drug = max(is_study_drug)
keep if trial_has_study_drug
bysort nct_id: egen trial_has_combination = max(is_combination)
drop if trial_has_combination

drop is_study_drug trial_has_study_drug is_combination trial_has_combination

* Number the interventions within each trial and keep at most seven.
* NOTE(review): no secondary sort key is given, so the numbering within a
* trial follows whatever order the sort leaves the rows in -- confirm that
* this is acceptable.
bysort nct_id: generate slot = _n
drop if slot > 7
reshape wide drug_name, i(nct_id) j(slot)
save "../derived/NCT_registry/interventions.dta", replace

	
* Conditions: keep trials with a depression-related condition name,
* one row per trial.
import delimited using "../raw/NCT_Registry/data/conditions.txt", ///
	delimiters("|") clear
generate is_depression = 0
foreach phrase in "major depressive disorder" "depressive disorder, major" "depressive" "depression" {
	replace is_depression = 1 if strpos(downcase_name, "`phrase'") > 0
}
keep if is_depression
* Keep a single matching condition row per trial
bysort nct_id: keep if _n == 1
keep nct_id downcase_name
format downcase_name %50s
save "../derived/NCT_registry/conditions.dta", replace

* References: keep one results reference (pmid) per trial.
import delimited using "../raw/NCT_Registry/data/study_references.txt", ///
	delimiters("|") clear
drop if reference_type != "results_reference"
* Keep a single results reference per trial
bysort nct_id: drop if _n > 1
keep nct_id pmid
save "../derived/NCT_registry/references.dta", replace

* Sponsors: keep the lead sponsor's agency class and name for each trial.
import delimited using "../raw/NCT_Registry/data/sponsors.txt", ///
	delimiters("|") clear
drop if lead_or_collaborator != "lead"
keep nct_id agency_class name
rename name sponsor_name
* Display format only: show up to 50 characters
format sponsor_name %50s
save "../derived/NCT_registry/sponsors.dta", replace

* Designs: keep randomized trials whose primary purpose is basic science or
* treatment and whose intervention model is parallel assignment or blank.
import delimited using "../raw/NCT_Registry/data/designs.txt", ///
	delimiters("|") clear
keep if allocation == "Randomized" ///
	& inlist(primary_purpose, "Basic Science", "Treatment") ///
	& inlist(intervention_model, "Parallel Assignment", "")
keep nct_id allocation primary_purpose intervention_model
save "../derived/NCT_registry/designs.dta", replace

* Eligibility: remove trials matching the listed age limits (maximum age of
* 11, 17 or 18 years; minimum age of 6 to 17 years) and trials that accept
* healthy volunteers. The age lists are split across several inlist() calls.
import delimited "../raw/NCT_Registry/data/eligibilities.txt", ///
	delimiter("|" "~") clear
drop if inlist(maximum_age, "11 Years", "17 Years", "18 Years") ///
	| inlist(minimum_age, "6 Years", "7 Years", "8 Years", "9 Years", "10 Years", "11 Years", "12 Years") ///
	| inlist(minimum_age, "13 Years", "14 Years", "15 Years", "16 Years", "17 Years") ///
	| healthy_volunteers == "Accepts Healthy Volunteers"
save "../derived/NCT_registry/eligible.dta", replace

***
* Merge
* Combines the derived registry files on nct_id, attaches the paper-level
* link file, applies the title- and enrollment-based exclusions, and saves
* registry_all.dta.
***

* Files merged with keep(3) act as inclusion filters (a trial must appear in
* them); files merged with keep(1 3) only add columns where available.
use ../derived/NCT_registry/studies.dta, clear
merge 1:1 nct_id using ../derived/NCT_registry/conditions.dta, assert(1 3) keep(3) nogen
merge 1:1 nct_id using ../derived/NCT_registry/interventions.dta, assert(1 2 3) keep(3) nogen
merge 1:1 nct_id using ../derived/NCT_registry/designs.dta, assert(1 2 3) keep(3) nogen
merge 1:1 nct_id using ../derived/NCT_registry/references.dta, assert(1 2 3) keep(1 3) nogen
merge 1:1 nct_id using ../derived/NCT_registry/sponsors.dta, assert(1 2 3) keep(1 3) nogen
merge 1:1 nct_id using ../derived/NCT_registry/responsible_party.dta, assert(1 2 3) keep(1 3) nogen
merge 1:1 nct_id using ../derived/NCT_registry/eligible.dta, assert(1 2 3) keep(3) nogen

* Attach the paper-level link file. registry_link records the merge result:
* 1 = registry only, 2 = paper only, 3 = found in both.
merge 1:m nct_id using ../derived/NCT_registry/registry_link.dta, ///
	gen(registry_link) keep(1 2 3)
* A registry results reference (pmid) also counts as published; rows with
* neither a pmid nor a paper-based flag are marked unpublished.
replace published = 1 if pmid!=.
replace published = 0 if pmid ==. & published ==.

* Drop due to inclusion criteria (chronic depression, relapse, monotherapy,
* adjunctive and discontinuation trials), identified from the brief title.
* Matching is case-insensitive so each keyword is caught however it is
* capitalised. The previous per-case list searched for "adjunctivey" (a typo
* that never matched) and had no lower-case "discontinuation".
foreach keyword in chronic relapse monotherapy adjunctive discontinuation {
	drop if strpos(lower(brief_title), "`keyword'") > 0
}
* Drop trials that report zero enrollment
drop if enrollment==0

tab registry_link
save ../derived/NCT_registry/registry_all.dta, replace

* Get estimate of magnitude of effect for in-paper statistics
use ../derived/NCT_registry/registry_all.dta, clear

* Extract the year (first four characters) from each registry date string.
* The submission date is referenced by its full variable name rather than an
* abbreviation, so this still works when variable abbreviation is turned off.
gen submit_year = substr(study_first_submitted_date,1,4)
destring submit_year, replace
gen start_year = substr(start_date,1,4)
destring start_year, replace
gen end_year = substr(completion_date,1,4)
destring end_year, replace


// Of 71 trials linked in analysis sample, 64 are in this registry sample
tab registry_link if linked & registry_link!=1

tab published if registry_link==1

* Lag between registry submission and publication year
gen diff = year-submit_year
summ diff, d
* Median trial published 4 years after submission, 90th percentile published 5 years after
* (plain comment: a leading /// is a line-continuation marker and would join
* the following line onto the comment)

* Restrict to trials first submitted in 2006-2010
drop if submit_year<=2005
drop if submit_year>=2011 | submit_year==.


* 25% in analysis sample
gen in_analysis_sample = registry_link==3
tab in_analysis_sample



** Clean up files
* Delete the intermediate datasets written above. capture suppresses the
* error when a file does not exist.
local intermediate_files conditions designs drug_list eligible interventions ///
	references registry_all responsible_party sponsors studies
foreach dataset of local intermediate_files {
	capture erase "../derived/NCT_registry/`dataset'.dta"
}
